Load required libraries.

library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(grid)
library(wordbankr)
library(langcog)
theme_set(theme_mikabr())

Load in Wordbank data.

items <- get_item_data() %>%
  filter(type == "word") %>%
  mutate(num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))))

Function for getting vocabulary composition data for a given instrument.

get_vocab_comp <- function(input_language, input_form) {
  
  lang_vocab_items <- filter(items, language == input_language, form == input_form) %>%
    filter(lexical_category %in% c("nouns", "predicates", "function_words"))
  
  lang_vocab_data <- get_instrument_data(instrument_language = input_language,
                                         instrument_form = input_form,
                                         items = lang_vocab_items$item_id,
                                         iteminfo = lang_vocab_items) %>%
    mutate(value = ifelse(is.na(value), "", value),
           produces = value == "produces",
           understands = value == "produces" | value == "understands") %>%
    select(-value) %>%
    gather(measure, value, produces, understands)
  
  num_words <- nrow(lang_vocab_items)
  
  lang_vocab_summary <- lang_vocab_data %>%
    group_by(data_id, measure, lexical_category) %>%
    summarise(num_true = sum(value),
              prop = sum(value) / n())
  
  lang_vocab_sizes <- lang_vocab_summary %>%
    summarise(vocab_num = sum(num_true),
              vocab = sum(num_true) / num_words)
  
  lang_vocab_summary %>%
    left_join(lang_vocab_sizes) %>%
    mutate(prop_vocab = num_true / vocab_num) %>%
    select(-num_true) %>%
    mutate(language = input_language, form = input_form)
  
}

Get vocabulary composition data for all instruments.

instruments <- items %>%
  select(language, form) %>%
  distinct()

vocab_comp_data <- map2(instruments$language, instruments$form, get_vocab_comp) %>%
  bind_rows()

Show sample size of each instrument.

sample_sizes <- vocab_comp_data %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  select(language, form, n) %>%
  distinct()
kable(sample_sizes)
language form n
British Sign Language WG 161
Cantonese WS 987
Croatian WG 250
Croatian WS 377
Danish WS 3714
English WG 2454
English WS 5824
German WS 1183
Hebrew WG 62
Hebrew WS 253
Italian WG 648
Italian WS 752
Mandarin TC 652
Mandarin WS 1056
Norwegian WG 3025
Norwegian WS 12969
Russian WG 768
Russian WS 1037
Spanish WG 778
Spanish WS 1094
Swedish WG 474
Swedish WS 900
Turkish WG 1115
Turkish WS 2422

Base plot for looking at vocabulary composition.

base_plot <- function(input_form, input_measure) {
  vocab_comp_data %>%
    filter(form == input_form, measure == input_measure) %>%
    mutate(lexical_category = factor(lexical_category,
                                     levels = c("nouns", "predicates", "function_words"),
                                     labels = c("Nouns  ", "Predicates  ", "Function Words"))) %>%
    ggplot(aes(x = vocab, y = prop, colour = lexical_category)) +
    facet_wrap(~language) +
    geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") + 
    scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "Proportion of Category\n") +
    scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "\nVocabulary Size") +
    scale_colour_solarized(name = "") +
    theme(legend.position = "top",
          legend.key = element_blank(),
          legend.background = element_rect(fill = "transparent"))
}

Plot WS productive vocabulary composition as a function of vocabulary size for each language.

base_plot("WS", "produces") + geom_jitter(size = 0.7)

Plot WG productive vocabulary composition as a function of vocabulary size for each language.

base_plot("WG", "produces") + geom_jitter(size = 0.7)

Plot WG receptive vocabulary composition as a function of vocabulary size for each language.

base_plot("WG", "understands") + geom_jitter(size = 0.7)

Plot WS productive vocabulary composition as a function of vocabulary size for each language with cubic contrained lm curves.

base_plot("WS", "produces") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

Plot WG productive vocabulary composition as a function of vocabulary size for each language with cubic contrained lm curves.

base_plot("WG", "produces") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

Plot WG receptive vocabulary composition as a function of vocabulary size for each language with cubic contrained lm curves.

base_plot("WG", "understands") +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)

Function for resampling data and computing area estimate for each sample.

sample_areas <- function(d, nboot = 1000) {
  
  poly_area <- function(group_data) {
    model = clm(prop ~ I(vocab ^ 3) + I(vocab ^ 2) + vocab - 1,
                data = group_data)
    return((model$solution %*% c(1/4, 1/3, 1/2) - 0.5)[1])
  }
  
  counter <- 1
  sample_area <- function(d) {
    d_frame <- d %>%
      group_by(language, form, measure) %>%
      sample_frac(replace = TRUE) %>%
      group_by(language, form, measure, lexical_category) %>%
      do(area = poly_area(.)) %>%
      mutate(area = area[1]) %>%
      rename_(.dots = setNames("area", counter))
    
    counter <<- counter + 1 # increment counter outside scope
    return(d_frame)
  }
  
  areas <- replicate(nboot, sample_area(d), simplify = FALSE)
  
  Reduce(left_join, areas) %>%
    gather(sample, area, -language, -form, -measure, -lexical_category)
}

Resample data and find the mean and CI of the area estimate.

areas <- sample_areas(vocab_comp_data, 1000)

area_summary <- areas %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(mean =  mean(area),
            ci_lower = ci_lower(area),
            ci_upper = ci_upper(area)) %>%
  ungroup() %>%
  mutate(language = factor(language),
         instrument = paste(language, form))

area_order <- filter(area_summary, form == "WS", measure == "produces",
                     lexical_category == "nouns")
language_levels <- area_order$language[order(area_order$mean,
                                             area_order$language,
                                             decreasing = FALSE)]

area_summary_ordered <- area_summary %>%
  filter(form %in% c("WS", "WG"),
         !(form == "WS" & measure == "understands")) %>%
  ungroup() %>%
  mutate(language = factor(language, levels = language_levels),
         lexical_category = factor(lexical_category,
                                   levels = c("nouns", "predicates", "function_words"),
                                   labels = c("Nouns", "Predicates", "Function Words")))

Plot each lexical category’s area estimate by language, form, and measure.

ggplot(area_summary_ordered,
       aes(y = language, x = mean, colour = lexical_category)) +
  facet_grid(lexical_category ~ form + measure) +
  geom_point() +
  geom_segment(aes(x = ci_lower, xend = ci_upper,
                   y = language, yend = language)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") + 
  scale_colour_solarized(name = "", guide = FALSE) +
  scale_y_discrete(name = "", limits = levels(area_summary_ordered$language)) +
  xlab("\nRelative representation in early vocabulary")

Plots for BUCLD poster

Data and models.

sample_sizes_ordered <- sample_sizes %>%
  filter(form == "WS") %>%
  mutate(language = factor(language, levels = rev(language_levels)))

language_size_order <- unlist(map(
  levels(sample_sizes_ordered$language),
  function(language) sprintf("%s (N = %s)", language,
                             sample_sizes_ordered$n[which(sample_sizes_ordered$language == language)])
))

vocab_comp_data_ordered <- vocab_comp_data %>%
  filter(form == "WS", measure == "produces") %>%
  mutate(language = factor(language, levels = rev(language_levels)),
         lexical_category = factor(lexical_category,
                                   levels = c("nouns", "predicates", "function_words"),
                                   labels = c("Nouns  ", "Predicates  ", "Function Words"))) %>%
  left_join(sample_sizes_ordered) %>%
  mutate(language_with_size = sprintf("%s (N = %s)", language, n),
         language_with_size = factor(language_with_size, levels = language_size_order))

ggplot(vocab_comp_data_ordered, aes(x = vocab, y = prop, colour = lexical_category)) +
  facet_wrap(~language_with_size, ncol = 5) +
  geom_jitter(size = 0.7, alpha = 0.5) +
  geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1, size = 1, se = FALSE) +
  geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") + 
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "Proportion of Category\n") +
  scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                     name = "\nVocabulary Size") +
  scale_colour_solarized(name = "") +
  theme_mikabr(base_size = 20) +
  theme(legend.position = "top",
        legend.key = element_blank(),
        legend.background = element_rect(fill = "transparent"))

#ggsave("BUCLD/data_models.png", width = 15, height = 10)

Demo of area estimation.

demo_langs <- c("English", "Mandarin")
demo_data <- filter(vocab_comp_data, form == "WS", language %in% demo_langs) %>%
  mutate(panel = paste(language, "(data)"),
         lexical_category = factor(lexical_category,
                                   levels = c("nouns", "predicates", "function_words"),
                                   labels = c("Nouns", "Predicates", "Function Words")))
pts <- seq(0, 1, 0.01)

models <- demo_data %>%
  group_by(language, lexical_category) %>%
  do(model = clm(prop ~ I(vocab ^ 3) + I(vocab ^ 2) + vocab - 1, data = .))

get_lang_lexcat_predictions <- function(lang, lexcat) {
  model <- filter(models, language == lang, lexical_category == lexcat)$model[[1]]
  data.frame(vocab = pts,
             prop = predict(model, newdata = data.frame(vocab = pts)),
             lexical_category = lexcat,
             language = lang)
}

get_lang_predictions <- function(lang) {
  bind_rows(sapply(unique(demo_data$lexical_category),
                   function(lexcat) get_lang_lexcat_predictions(lang, lexcat),
                   simplify = FALSE))
}

predictions <- bind_rows(sapply(demo_langs, get_lang_predictions, simplify = FALSE))

diagonal <- expand.grid(vocab = rep(rev(pts)),
                        language = demo_langs,
                        lexical_category = unique(demo_data$lexical_category))
diagonal$prop <- diagonal$vocab

area_poly <- bind_rows(predictions, diagonal) %>%
  mutate(panel = paste(language, "(models)"))

ggplot(filter(predictions, language == "English"), aes(x = vocab, y = prop)) +
  facet_grid(. ~ lexical_category) +
  geom_line(aes(colour = lexical_category), size = 1) +
  geom_polygon(data = filter(area_poly, language == "English"),
               aes(fill = lexical_category), alpha = 0.2) +
  geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") + 
  scale_y_continuous(limits = c(0, 1), breaks = c(),
                     name = "") +
  scale_x_continuous(limits = c(0, 1), breaks = c(),
                     name = "") +
  scale_colour_solarized(guide = FALSE) +
  scale_fill_solarized(guide = FALSE) +
  theme(legend.position = c(0.061, 0.91),
        legend.text = element_text(size = 8),
        legend.key.height = unit(0.9, "char"),
        legend.key.width = unit(0.88, "char"),
        legend.background = element_rect(fill = "transparent"),
        strip.background = element_blank(),
        strip.text.x = element_blank())

#ggsave("BUCLD/area_demo.png", width = 9, height = 3.5)

WS area estimates.

ggplot(filter(area_summary_ordered, form == "WS"),
       aes(y = language, x = mean, col = lexical_category)) +
  facet_grid(lexical_category ~ .) +
  geom_point() +
  geom_segment(aes(x = ci_lower, xend = ci_upper,
                   y = language, yend = language)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray") + 
  scale_colour_solarized(name = "", guide = FALSE) +
  scale_y_discrete(name = "", limits = levels(area_summary_ordered$language)) +
  xlab("\nRelative representation in early vocabulary") +
  theme_mikabr(base_size = 20)

#ggsave("BUCLD/diffs.png", width = 8, height = 11)